import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV
data=pd.read_csv(r'C:\Users\prati\OneDrive\Desktop\Pract_2\Churn_Modelling.csv')
data
# exited is the dependent variable and others are independent
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | 9996 | 15606229 | Obijiaku | 771 | France | Male | 39 | 5 | 0.00 | 2 | 1 | 0 | 96270.64 | 0 |
| 9996 | 9997 | 15569892 | Johnstone | 516 | France | Male | 35 | 10 | 57369.61 | 1 | 1 | 1 | 101699.77 | 0 |
| 9997 | 9998 | 15584532 | Liu | 709 | France | Female | 36 | 7 | 0.00 | 1 | 0 | 1 | 42085.58 | 1 |
| 9998 | 9999 | 15682355 | Sabbatini | 772 | Germany | Male | 42 | 3 | 75075.31 | 2 | 1 | 0 | 92888.52 | 1 |
| 9999 | 10000 | 15628319 | Walker | 792 | France | Female | 28 | 4 | 130142.79 | 1 | 1 | 0 | 38190.78 | 0 |
10000 rows × 14 columns
col = ['Geography', 'Gender']
lbe = preprocessing.LabelEncoder()
# This initializes a LabelEncoder object from the preprocessing module in scikit-learn.
for i in col:
data[i] = lbe.fit_transform(data[i])
data
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | 0 | 0 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 2 | 15647311 | Hill | 608 | 2 | 0 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 3 | 15619304 | Onio | 502 | 0 | 0 | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 4 | 15701354 | Boni | 699 | 0 | 0 | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850 | 2 | 0 | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | 9996 | 15606229 | Obijiaku | 771 | 0 | 1 | 39 | 5 | 0.00 | 2 | 1 | 0 | 96270.64 | 0 |
| 9996 | 9997 | 15569892 | Johnstone | 516 | 0 | 1 | 35 | 10 | 57369.61 | 1 | 1 | 1 | 101699.77 | 0 |
| 9997 | 9998 | 15584532 | Liu | 709 | 0 | 0 | 36 | 7 | 0.00 | 1 | 0 | 1 | 42085.58 | 1 |
| 9998 | 9999 | 15682355 | Sabbatini | 772 | 1 | 1 | 42 | 3 | 75075.31 | 2 | 1 | 0 | 92888.52 | 1 |
| 9999 | 10000 | 15628319 | Walker | 792 | 0 | 0 | 28 | 4 | 130142.79 | 1 | 1 | 0 | 38190.78 | 0 |
10000 rows × 14 columns
0 is Female and 1 is Male
In exited 1 means customer left the bank and 0 means customer does not left the bank
and label encoding to convert the categorical data into numerical.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RowNumber 10000 non-null int64 1 CustomerId 10000 non-null int64 2 Surname 10000 non-null object 3 CreditScore 10000 non-null int64 4 Geography 10000 non-null int32 5 Gender 10000 non-null int32 6 Age 10000 non-null int64 7 Tenure 10000 non-null int64 8 Balance 10000 non-null float64 9 NumOfProducts 10000 non-null int64 10 HasCrCard 10000 non-null int64 11 IsActiveMember 10000 non-null int64 12 EstimatedSalary 10000 non-null float64 13 Exited 10000 non-null int64 dtypes: float64(2), int32(2), int64(9), object(1) memory usage: 1015.8+ KB
data.shape
(10000, 14)
print("Number of rows :",data.shape[0])
print("Number of columns :",data.shape[1])
Number of rows : 10000 Number of columns : 14
data.describe() #statistical info of data
# standard deviation (std)
| RowNumber | CustomerId | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10000.00000 | 1.000000e+04 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.00000 | 10000.000000 | 10000.000000 | 10000.000000 |
| mean | 5000.50000 | 1.569094e+07 | 650.528800 | 0.746300 | 0.545700 | 38.921800 | 5.012800 | 76485.889288 | 1.530200 | 0.70550 | 0.515100 | 100090.239881 | 0.203700 |
| std | 2886.89568 | 7.193619e+04 | 96.653299 | 0.827529 | 0.497932 | 10.487806 | 2.892174 | 62397.405202 | 0.581654 | 0.45584 | 0.499797 | 57510.492818 | 0.402769 |
| min | 1.00000 | 1.556570e+07 | 350.000000 | 0.000000 | 0.000000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 11.580000 | 0.000000 |
| 25% | 2500.75000 | 1.562853e+07 | 584.000000 | 0.000000 | 0.000000 | 32.000000 | 3.000000 | 0.000000 | 1.000000 | 0.00000 | 0.000000 | 51002.110000 | 0.000000 |
| 50% | 5000.50000 | 1.569074e+07 | 652.000000 | 0.000000 | 1.000000 | 37.000000 | 5.000000 | 97198.540000 | 1.000000 | 1.00000 | 1.000000 | 100193.915000 | 0.000000 |
| 75% | 7500.25000 | 1.575323e+07 | 718.000000 | 1.000000 | 1.000000 | 44.000000 | 7.000000 | 127644.240000 | 2.000000 | 1.00000 | 1.000000 | 149388.247500 | 0.000000 |
| max | 10000.00000 | 1.581569e+07 | 850.000000 | 2.000000 | 1.000000 | 92.000000 | 10.000000 | 250898.090000 | 4.000000 | 1.00000 | 1.000000 | 199992.480000 | 1.000000 |
data.columns
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
'IsActiveMember', 'EstimatedSalary', 'Exited'],
dtype='object')
data.drop(['RowNumber','Surname','CustomerId'],axis=1,inplace=True)
data.shape
(10000, 11)
data.columns
Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
'Exited'],
dtype='object')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CreditScore 10000 non-null int64 1 Geography 10000 non-null int32 2 Gender 10000 non-null int32 3 Age 10000 non-null int64 4 Tenure 10000 non-null int64 5 Balance 10000 non-null float64 6 NumOfProducts 10000 non-null int64 7 HasCrCard 10000 non-null int64 8 IsActiveMember 10000 non-null int64 9 EstimatedSalary 10000 non-null float64 10 Exited 10000 non-null int64 dtypes: float64(2), int32(2), int64(7) memory usage: 781.4 KB
# numerical columns
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:", numerical_columns)
Numerical columns: Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
'IsActiveMember', 'EstimatedSalary', 'Exited'],
dtype='object')
# categorical columns
data_types = data.dtypes
# Filter columns with 'object' or 'category' data type
categorical_columns = data_types[data_types == 'object'].index.tolist()
# .index.tolist(): This extracts the index labels of the selected entries (i.e., the names of the columns)
# and converts them into a list.
# Print the list of categorical columns
print("Categorical columns:", categorical_columns)
Categorical columns: []
there is no categorical data because we have already converted them in numerical
data.isnull()
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | False | False | False | False | False | False | False | False | False | False | False |
| 9996 | False | False | False | False | False | False | False | False | False | False | False |
| 9997 | False | False | False | False | False | False | False | False | False | False | False |
| 9998 | False | False | False | False | False | False | False | False | False | False | False |
| 9999 | False | False | False | False | False | False | False | False | False | False | False |
10000 rows × 11 columns
data.isnull().sum()
CreditScore 0 Geography 0 Gender 0 Age 0 Tenure 0 Balance 0 NumOfProducts 0 HasCrCard 0 IsActiveMember 0 EstimatedSalary 0 Exited 0 dtype: int64
as we can see there is no missing values present
data.head(2)
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 0 | 0 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 608 | 2 | 0 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
(data[data['Exited']==1].shape[0]/data.shape[0])*100
20.369999999999997
only around 20% of the data is showing churn
churn=data['Exited'].value_counts().reset_index()
churn.head()
# 0 means the customer has not exited and 1 means customer has already exited
| index | Exited | |
|---|---|---|
| 0 | 0 | 7963 |
| 1 | 1 | 2037 |
sns.barplot(x=churn['index'], y=churn['Exited'])
# imbalanced data set
<AxesSubplot:xlabel='index', ylabel='Exited'>
churning=data[data['Exited']==1]
churning.head(2)
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 0 | 0 | 42 | 2 | 0.0 | 1 | 1 | 1 | 101348.88 | 1 |
| 2 | 502 | 0 | 0 | 42 | 8 | 159660.8 | 3 | 1 | 0 | 113931.57 | 1 |
def calculate_ratios(ax):
# This line defines a function named
# calculate_ratios that takes one argument, ax, which is assumed to be a Matplotlib Axes object.
l=len(ax.patches)
# calculates the total number of patches (bars) in the given Axes object ax and stores it in the variable l.
cnt=0
# The variable cnt is later used to keep track of the number of iterations in the loop.
without_churn=[]
with_churn=[]
# 2 empty list
for p in ax.patches:
if cnt<l/2:
without_churn.append(p.get_height())
else:
with_churn.append(p.get_height())
cnt+=1
print("without_churn : ",without_churn)
print("with_churn : ",with_churn)
ratio=[]
for i,j in zip(without_churn,with_churn):
ratio.append(j/i)
print("ratio : ",ratio)
data['Geography'].unique()
array([0, 2, 1])
ax=sns.countplot(x=data['Geography'],hue=data['Exited'])
plt.xticks(rotation=90)
(array([0, 1, 2]), [Text(0, 0, '0'), Text(1, 0, '1'), Text(2, 0, '2')])
0 is France 1 is Germany and 2 is Spain
from the above graph, we can see that the most regular customers are from France which are least likely to churn
calculate_ratios(ax)
without_churn : [4204, 1695, 2064] with_churn : [810, 814, 413] ratio : [0.19267364414843008, 0.48023598820059, 0.2000968992248062]
so, far from the above ratio, we have seen that Germany has the maximum ratio. The customers who churn mostly are from Germany the germany and france and spain
plt.figure(figsize=(6,4))
g=sns.countplot(x=data['Gender'],hue=data['Exited'])
calculate_ratios(g)
without_churn : [3404, 4559] with_churn : [1139, 898] ratio : [0.33460634547591067, 0.19697302039921036]
(churning[churning['Gender']==0].shape[0]/data[data['Gender']==0].shape[0])*100
# churn rate percentage of females
25.071538630860662
Female customers are more likely to churn. On the average almost 25% female customer churn.
plt.figure(figsize=(6,4))
sns.kdeplot(x=data['Tenure'],hue=data['Exited'],multiple='stack')
# kernel density estimation plot
<AxesSubplot:xlabel='Tenure', ylabel='Density'>
above graph shows the relationship between tenure and churning. The lesser the tenure, more the chances to churn. so one of the ways to reduce customer churning would be to retain the customer for longer tenure, so it reduces the chance of churning.
dev=sns.countplot(x=data['Tenure'],hue=data['Exited'])
calculate_ratios(dev)
without_churn : [318, 803, 847, 796, 786, 803, 771, 851, 828, 771, 389] with_churn : [95, 232, 201, 213, 203, 209, 196, 177, 197, 213, 101] ratio : [0.29874213836477986, 0.2889165628891656, 0.23730814639905548, 0.2675879396984925, 0.2582697201017812, 0.2602739726027397, 0.25421530479896237, 0.20799059929494712, 0.23792270531400966, 0.27626459143968873, 0.2596401028277635]
customers with 1 year tenure are more likely to churn
status=churning['HasCrCard'].value_counts().reset_index()
status.head()
| index | HasCrCard | |
|---|---|---|
| 0 | 1 | 1424 |
| 1 | 0 | 613 |
Cr=sns.countplot(x=data['HasCrCard'],hue=data['Exited'])
calculate_ratios(Cr)
without_churn : [2332, 5631] with_churn : [613, 1424] ratio : [0.26286449399656947, 0.25288581069081867]
the customer who does not have credit cards are more likely to churn
status=sns.countplot(x=data['IsActiveMember'],hue=data['Exited'])
calculate_ratios(status)
without_churn : [3547, 4416] with_churn : [1302, 735] ratio : [0.3670707640259374, 0.16644021739130435]
The customers who are not an active members are more likely to churn
data['NumOfProducts'].unique()
array([1, 3, 2, 4], dtype=int64)
productpref=sns.barplot(x=data['NumOfProducts'],y=data['IsActiveMember'],hue=data['Exited'])
plt.xticks(rotation=90)
(array([0, 1, 2, 3]), [Text(0, 0, '1'), Text(1, 0, '2'), Text(2, 0, '3'), Text(3, 0, '4')])
calculate_ratios(productpref)
without_churn : [0.5654421768707483, 0.5445544554455446, 0.6086956521739131, nan] with_churn : [0.34421575585521647, 0.39080459770114945, 0.38636363636363635, 0.48333333333333334] ratio : [0.6087550061443313, 0.7176593521421107, 0.6347402597402597, nan]
sns.lineplot(x=data['NumOfProducts'],y=data['IsActiveMember'],hue=data['Exited'])
# shadow around the lines represents the confidence interval (CI) of the estimated mean values
# The shade typically indicates the degree of uncertainty associated with the mean estimate at each point along the x-axis.
<AxesSubplot:xlabel='NumOfProducts', ylabel='IsActiveMember'>
from the above graph, we can see the most of the customers that churn are having more products. This shows that the organisation or the company should focus on long term relations with customer to provide them consistent quality products.
plt.figure(figsize=(16,10))
sns.histplot(x=data['CreditScore'],hue=data['Exited'],bins=100)
# bidding concept for credit score
# because the value is very close to each other and conjested looking graph and does not convey any thing
# bidding is based on frequency and count so when you group them this will clear every thing
<AxesSubplot:xlabel='CreditScore', ylabel='Count'>
the customer exited(1) is very low which is either they have low credit score because of their past transactions they would pass to repayment of the lower capacity is very less so they have churned the most and even the customers who have a good credit score they are less likely to charge
data.groupby('Gender')['IsActiveMember'].value_counts()
Gender IsActiveMember
0 1 2284
0 2259
1 1 2867
0 2590
Name: IsActiveMember, dtype: int64
sns.barplot(x=data['Gender'],y=data['IsActiveMember'], hue=data['Exited'])
# line on the top is error bars indicates the uncertainty or variability in the data
<AxesSubplot:xlabel='Gender', ylabel='IsActiveMember'>
nearly both men and women are active members, not so much gender bias
data.columns
Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
'Exited'],
dtype='object')
correlation = data['CustomerId'].corr(data['Exited']) print("Correlation between CustomerID and Exited:", correlation)
the value is -0.0006.........something thats why we drop it in the begining
data.dtypes
CreditScore int64 Geography int32 Gender int32 Age int64 Tenure int64 Balance float64 NumOfProducts int64 HasCrCard int64 IsActiveMember int64 EstimatedSalary float64 Exited int64 dtype: object
checking the inconsistent data values in each columns
data.head(5)
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 0 | 0 | 42 | 2 | 0.00 | 1 | 1 | 1 | 101348.88 | 1 |
| 1 | 608 | 2 | 0 | 41 | 1 | 83807.86 | 1 | 0 | 1 | 112542.58 | 0 |
| 2 | 502 | 0 | 0 | 42 | 8 | 159660.80 | 3 | 1 | 0 | 113931.57 | 1 |
| 3 | 699 | 0 | 0 | 39 | 1 | 0.00 | 2 | 0 | 0 | 93826.63 | 0 |
| 4 | 850 | 2 | 0 | 43 | 2 | 125510.82 | 1 | 1 | 1 | 79084.10 | 0 |
# Isactivemember is grouped according to the hascrcard variables and the dependent variable is examined.
data.groupby(["IsActiveMember", "HasCrCard"]).agg({"Exited":"count"})
| Exited | ||
|---|---|---|
| IsActiveMember | HasCrCard | |
| 0 | 0 | 1401 |
| 1 | 3448 | |
| 1 | 0 | 1544 |
| 1 | 3607 |
checking the number of people who are active or not and has credit card or not and yet has exited or not
# Isactivemember is grouped according to hascrcard variables and the balance variable is examined.
data.groupby(["IsActiveMember", "HasCrCard"]).agg({"Balance" : "mean"})
| Balance | ||
|---|---|---|
| IsActiveMember | HasCrCard | |
| 0 | 0 | 77825.424525 |
| 1 | 76853.588646 | |
| 1 | 0 | 78007.318381 |
| 1 | 74962.849983 |
people who are active member and has cresdit card with balance
# The balance variable was examined according to the gender variable.
data.groupby("Gender").agg({"Balance": "mean"})
| Balance | |
|---|---|
| Gender | |
| 0 | 75659.369139 |
| 1 | 77173.974506 |
Male and female and there balance
# How many people whose balance is 0 and do not leave?
data[(data["Balance"] == 0) & (data["Exited"] == 0)]
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 699 | 0 | 0 | 39 | 1 | 0.0 | 2 | 0 | 0 | 93826.63 | 0 |
| 6 | 822 | 0 | 1 | 50 | 7 | 0.0 | 2 | 1 | 1 | 10062.80 | 0 |
| 11 | 497 | 2 | 1 | 24 | 3 | 0.0 | 2 | 1 | 0 | 76390.01 | 0 |
| 12 | 476 | 0 | 0 | 34 | 10 | 0.0 | 2 | 1 | 0 | 26260.98 | 0 |
| 13 | 549 | 0 | 0 | 25 | 5 | 0.0 | 2 | 0 | 0 | 190857.79 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9988 | 775 | 0 | 1 | 30 | 4 | 0.0 | 2 | 1 | 0 | 49337.84 | 0 |
| 9989 | 841 | 2 | 1 | 28 | 4 | 0.0 | 2 | 1 | 1 | 179436.60 | 0 |
| 9992 | 726 | 2 | 1 | 36 | 2 | 0.0 | 1 | 1 | 0 | 195192.40 | 0 |
| 9994 | 800 | 0 | 0 | 29 | 2 | 0.0 | 2 | 0 | 0 | 167773.55 | 0 |
| 9995 | 771 | 0 | 1 | 39 | 5 | 0.0 | 2 | 1 | 0 | 96270.64 | 0 |
3117 rows × 11 columns
how many people whose balance is 0 yet has not churn
# How many people whose balance is 0 leave?
data[(data["Balance"] == 0) & (data["Exited"] == 1)]
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 619 | 0 | 0 | 42 | 2 | 0.0 | 1 | 1 | 1 | 101348.88 | 1 |
| 22 | 510 | 2 | 0 | 38 | 4 | 0.0 | 1 | 1 | 0 | 118913.53 | 1 |
| 30 | 591 | 2 | 0 | 39 | 3 | 0.0 | 3 | 1 | 0 | 140469.38 | 1 |
| 58 | 511 | 2 | 0 | 66 | 4 | 0.0 | 1 | 1 | 0 | 1643.11 | 1 |
| 81 | 777 | 0 | 0 | 32 | 2 | 0.0 | 1 | 1 | 0 | 136458.19 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9784 | 527 | 0 | 1 | 39 | 4 | 0.0 | 2 | 1 | 0 | 167183.07 | 1 |
| 9884 | 751 | 0 | 0 | 48 | 4 | 0.0 | 1 | 0 | 1 | 30165.06 | 1 |
| 9898 | 589 | 0 | 1 | 38 | 4 | 0.0 | 1 | 1 | 0 | 95483.48 | 1 |
| 9962 | 702 | 2 | 1 | 44 | 9 | 0.0 | 1 | 0 | 0 | 59207.41 | 1 |
| 9997 | 709 | 0 | 0 | 36 | 7 | 0.0 | 1 | 0 | 1 | 42085.58 | 1 |
500 rows × 11 columns
people with 0 balance who has churn
# Access to the correlation of the data set was provided. What kind of relationship is examined between the variables.
# If the correlation value is> 0, there is a positive correlation. While the value of one variable increases, the value of the other variable also increases.
# Correlation = 0 means no correlation.
# If the correlation is <0, there is a negative correlation. While one variable increases, the other variable decreases.
# When the correlations are examined, there are 1 variables that act as a positive correlation to the exited dependent variable.
# This variable is Age. As this increases, the Result variable increases.
data.corr()
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| CreditScore | 1.000000 | 0.007888 | -0.002857 | -0.003965 | 0.000842 | 0.006268 | 0.012238 | -0.005458 | 0.025651 | -0.001384 | -0.027094 |
| Geography | 0.007888 | 1.000000 | 0.004719 | 0.022812 | 0.003739 | 0.069408 | 0.003972 | -0.008523 | 0.006724 | -0.001369 | 0.035943 |
| Gender | -0.002857 | 0.004719 | 1.000000 | -0.027544 | 0.014733 | 0.012087 | -0.021859 | 0.005766 | 0.022544 | -0.008112 | -0.106512 |
| Age | -0.003965 | 0.022812 | -0.027544 | 1.000000 | -0.009997 | 0.028308 | -0.030680 | -0.011721 | 0.085472 | -0.007201 | 0.285323 |
| Tenure | 0.000842 | 0.003739 | 0.014733 | -0.009997 | 1.000000 | -0.012254 | 0.013444 | 0.022583 | -0.028362 | 0.007784 | -0.014001 |
| Balance | 0.006268 | 0.069408 | 0.012087 | 0.028308 | -0.012254 | 1.000000 | -0.304180 | -0.014858 | -0.010084 | 0.012797 | 0.118533 |
| NumOfProducts | 0.012238 | 0.003972 | -0.021859 | -0.030680 | 0.013444 | -0.304180 | 1.000000 | 0.003183 | 0.009612 | 0.014204 | -0.047820 |
| HasCrCard | -0.005458 | -0.008523 | 0.005766 | -0.011721 | 0.022583 | -0.014858 | 0.003183 | 1.000000 | -0.011866 | -0.009933 | -0.007138 |
| IsActiveMember | 0.025651 | 0.006724 | 0.022544 | 0.085472 | -0.028362 | -0.010084 | 0.009612 | -0.011866 | 1.000000 | -0.011421 | -0.156128 |
| EstimatedSalary | -0.001384 | -0.001369 | -0.008112 | -0.007201 | 0.007784 | 0.012797 | 0.014204 | -0.009933 | -0.011421 | 1.000000 | 0.012097 |
| Exited | -0.027094 | 0.035943 | -0.106512 | 0.285323 | -0.014001 | 0.118533 | -0.047820 | -0.007138 | -0.156128 | 0.012097 | 1.000000 |
# Correlation Matrix
f, ax = plt.subplots(figsize= [12,8])
sns.heatmap(data.corr(), annot=True, fmt=".2f", ax=ax, cmap = "magma" )
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()
# The distribution of the dependent variable in the dataset is plotted as pie and columns graphs.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data['Exited'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Distribution')
ax[0].set_ylabel('')
sns.countplot(x='Exited', data=data, ax=ax[1])
ax[1].set_title('Exited')
plt.show()
# Plotted the categorical variables on the basis of the graph of the column according to the dependent variable.
fig, axarr = plt.subplots(2, 2, figsize=(20, 12))
sns.countplot(x='Geography', hue = 'Exited',data = data, ax=axarr[0][0])
sns.countplot(x='Gender', hue = 'Exited',data = data, ax=axarr[0][1])
sns.countplot(x='HasCrCard', hue = 'Exited',data = data, ax=axarr[1][0])
sns.countplot(x='IsActiveMember', hue = 'Exited',data = data, ax=axarr[1][1])
<AxesSubplot:xlabel='IsActiveMember', ylabel='count'>
# Dependent variable was plotted according to age and geography variable.
import plotly.express as px
fig = px.bar(data,y = "Exited", x = "Age" , color = "Geography")
fig.show()
import plotly.express as px
labels = ['Female-Not Exited', 'Female-Exited', 'Male-Not Exited', 'Male-Exited']
Gender = data.groupby('Gender')['Exited'].value_counts().reset_index(name='count')
fig = px.pie(Gender, values='count', names=labels, title='Exited by Gender', hole=0.5)
fig.show()
# Boxplot graph for outlier observation analysis
fig, axarr = plt.subplots(3, 2, figsize=(20, 12))
sns.boxplot(y='CreditScore',x = 'Exited', hue = 'Exited',data = data, ax=axarr[0][0])
sns.boxplot(y='Age',x = 'Exited', hue = 'Exited',data = data , ax=axarr[0][1])
sns.boxplot(y='Tenure',x = 'Exited', hue = 'Exited',data = data, ax=axarr[1][0])
sns.boxplot(y='Balance',x = 'Exited', hue = 'Exited',data = data, ax=axarr[1][1])
sns.boxplot(y='NumOfProducts',x = 'Exited', hue = 'Exited',data = data, ax=axarr[2][0])
sns.boxplot(y='EstimatedSalary',x = 'Exited', hue = 'Exited',data = data, ax=axarr[2][1])
<AxesSubplot:xlabel='Exited', ylabel='EstimatedSalary'>
# Outlier Observation Analysis
for feature in data[['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
'Exited']]:
Q1 = data[feature].quantile(0.25)
Q3 = data[feature].quantile(0.75)
IQR = Q3-Q1
lower = Q1- 1.5*IQR
upper = Q3 + 1.5*IQR
if data[(data[feature] > upper)].any(axis=None):
print(feature,"yes")
else:
print(feature, "no")
CreditScore no Geography no Gender no Age yes Tenure no Balance no NumOfProducts yes HasCrCard no IsActiveMember no EstimatedSalary no Exited yes
models = {
'Logistic Regression': LogisticRegression(max_iter=10000),
'Support Vector Machine': SVC(),
'Random Forest': RandomForestClassifier(),
'Decision Tree': DecisionTreeClassifier(),
'K-Nearest Neighbors': KNeighborsClassifier()
}
x, y = data.drop('Exited', axis=1), data['Exited']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1 indicates you want only one split.
# test_size=0.2 specifies that 20% of the data will be used for testing.
# random_state=42 sets the random seed for reproducibility.
for train_index, test_index in sss.split(x, y):
x_train, x_test = x.loc[train_index], x.loc[test_index]
y_train, y_test = y.loc[train_index], y.loc[test_index]
# The .loc accessor in pandas allows us to select data based on labels of rows and/or columns.
x_train.shape, x_test.shape, y_train.shape, y_test.shape
((8000, 10), (2000, 10), (8000,), (2000,))
x_train.head()
| CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2151 | 753 | 0 | 1 | 57 | 7 | 0.00 | 1 | 1 | 0 | 159475.08 |
| 8392 | 739 | 1 | 1 | 32 | 3 | 102128.27 | 1 | 1 | 0 | 63981.37 |
| 5006 | 755 | 1 | 0 | 37 | 0 | 113865.23 | 2 | 1 | 1 | 117396.25 |
| 4117 | 561 | 0 | 1 | 37 | 5 | 0.00 | 2 | 1 | 0 | 83093.25 |
| 7182 | 692 | 1 | 1 | 49 | 6 | 110540.43 | 2 | 0 | 1 | 107472.99 |
for name, model in models.items():
model.fit(x_train, y_train)
# model.fit(x_train, y_train): This fits the model to the training data (x_train, y_train), where x_train contains the features and y_train contains the target variable.
y_pred = model.predict(x_test)
# make predictions on the test data (x_test), producing predicted values for the target variable.
accuracy = accuracy_score(y_test, y_pred)
# comparing y_test and y_pred for accuracy
print(f'{name} - Training Accuracy: {accuracy * 100}%')
# print(f'{name} - Training Accuracy: {accuracy * 100}%'): This prints out the name of the model along
# with its training accuracy, where the accuracy is multiplied by 100 to convert it to a percentage.
Logistic Regression - Training Accuracy: 79.65% Support Vector Machine - Training Accuracy: 79.65% Random Forest - Training Accuracy: 86.15% Decision Tree - Training Accuracy: 78.60000000000001% K-Nearest Neighbors - Training Accuracy: 76.4%
# for name, model in models.items():
# print(f'{name} - classification_report: \n{classification_report(y_test, model.predict(x_test))}')
from sklearn.metrics import classification_report
for name, model in models.items():
# Fit the model to the training data
model.fit(x_train, y_train)
# Generate predictions on the test set
y_pred = model.predict(x_test)
# Print the classification report for the model
print(f'{name} - classification_report: \n{classification_report(y_test, y_pred, zero_division=1)}')
print()
Logistic Regression - classification_report:
precision recall f1-score support
0 0.80 1.00 0.89 1593
1 1.00 0.00 0.00 407
accuracy 0.80 2000
macro avg 0.90 0.50 0.44 2000
weighted avg 0.84 0.80 0.71 2000
Support Vector Machine - classification_report:
precision recall f1-score support
0 0.80 1.00 0.89 1593
1 1.00 0.00 0.00 407
accuracy 0.80 2000
macro avg 0.90 0.50 0.44 2000
weighted avg 0.84 0.80 0.71 2000
Random Forest - classification_report:
precision recall f1-score support
0 0.87 0.96 0.92 1593
1 0.77 0.46 0.58 407
accuracy 0.86 2000
macro avg 0.82 0.71 0.75 2000
weighted avg 0.85 0.86 0.85 2000
Decision Tree - classification_report:
precision recall f1-score support
0 0.87 0.86 0.86 1593
1 0.47 0.48 0.47 407
accuracy 0.78 2000
macro avg 0.67 0.67 0.67 2000
weighted avg 0.79 0.78 0.78 2000
K-Nearest Neighbors - classification_report:
precision recall f1-score support
0 0.80 0.94 0.86 1593
1 0.26 0.08 0.13 407
accuracy 0.76 2000
macro avg 0.53 0.51 0.49 2000
weighted avg 0.69 0.76 0.71 2000
for name, model in models.items():
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
print(f'{name} - Cross Validation Score: {round(cross_val_score(model, x_test, y_test, cv=kfold).mean() * 100,3)}%')
# cross_val_score function automatically handles the splitting of the data into training and
# testing sets for each fold of cross-validation.
Logistic Regression - Cross Validation Score: 78.7% Support Vector Machine - Cross Validation Score: 79.65% Random Forest - Cross Validation Score: 85.7% Decision Tree - Cross Validation Score: 79.75% K-Nearest Neighbors - Cross Validation Score: 76.4%